# Load packages
library(dplyr)
library(e1071)
# Load the data
data <- read.csv("../who_data.csv")
# Remove irrelevant columns
# X, X.1, X.2, X.3 look like leftover index columns from the CSV export,
# and who_ms is unused downstream — TODO confirm against the raw file
who_data <- subset(data, select=-c(X,X.1,X.2,X.3,who_ms))
# Replace literal "NA" strings with proper R NA
# (read.csv normally maps "NA" via na.strings; this catches any that slipped
# through as literal text)
who_data[who_data == "NA"] <- NA
# Convert numeric-like columns stored as character to numeric
num_cols <- c("pm10_concentration","pm25_concentration","no2_concentration",
"pm10_tempcov","pm25_tempcov","no2_tempcov","population")
who_data[num_cols] <- lapply(who_data[num_cols], as.numeric)
# Convert categorical variables to correct type
who_data$type_of_stations <- as.character(who_data$type_of_stations)
who_data$who_region <- as.factor(who_data$who_region)
# Filter for European countries and remove pm25 column
# "4_Eur" is the WHO region code used for Europe in this dataset;
# pm25_concentration is dropped because PM10 is the prediction target here
europe_data <- who_data %>%
filter(who_region == "4_Eur") %>%
select(-pm25_concentration)
# Remove rows with missing values (SVR cannot handle NAs directly)
clean_data_europe <- na.omit(europe_data)
summary(clean_data_europe)
## who_region iso3 country_name city
## 4_Eur :4106 Length:4106 Length:4106 Length:4106
## : 0 Class :character Class :character Class :character
## 1_Afr : 0 Mode :character Mode :character Mode :character
## 2_Amr : 0
## 3_Sear : 0
## 5_Emr : 0
## (Other): 0
## year pm10_concentration no2_concentration pm10_tempcov
## Min. :2013 Min. : 6.045 Min. : 1.052 Min. : 0.00
## 1st Qu.:2015 1st Qu.:17.479 1st Qu.: 17.651 1st Qu.: 92.00
## Median :2017 Median :20.991 Median : 23.401 Median : 96.00
## Mean :2017 Mean :22.648 Mean : 24.314 Mean : 91.17
## 3rd Qu.:2019 3rd Qu.:25.958 3rd Qu.: 29.865 3rd Qu.: 99.00
## Max. :2022 Max. :88.000 Max. :110.432 Max. :100.00
##
## pm25_tempcov no2_tempcov type_of_stations population
## Min. : 0.00 Min. : 0.00 Length:4106 Min. : 199
## 1st Qu.: 89.00 1st Qu.: 93.00 Class :character 1st Qu.: 95430
## Median : 96.00 Median : 96.00 Mode :character Median : 173985
## Mean : 88.24 Mean : 93.57 Mean : 434184
## 3rd Qu.: 98.00 3rd Qu.: 99.00 3rd Qu.: 377134
## Max. :100.00 Max. :100.00 Max. :15190336
##
## latitude longitude
## Min. :-20.89 Min. :-61.536
## 1st Qu.: 43.53 1st Qu.: 1.724
## Median : 48.52 Median : 9.187
## Mean : 47.36 Mean : 8.979
## 3rd Qu.: 51.50 3rd Qu.: 15.642
## Max. : 69.67 Max. : 55.465
##
# Convert the remaining categorical variables to factors
clean_data_europe <- clean_data_europe %>%
  mutate(
    country_name = as.factor(country_name),
    type_of_stations = as.factor(type_of_stations),
    city = as.factor(city)
  )
# Split by year: training (<= 2019), validation (2020) and testing (2021)
train_data <- clean_data_europe %>% filter(year <= 2019)
valid_data <- clean_data_europe %>% filter(year == 2020)
test_data <- clean_data_europe %>% filter(year == 2021)
Support Vector Machines are supervised learning models used for both classification and regression problems. They work by finding an optimal boundary, known as a hyperplane, that best separates or predicts data points. The observations that lie closest to this boundary are called support vectors, and they are the most influential in defining the model.
There are two main types of Support Vector Machines:
- Support Vector Classification (SVC), which is used when the target
variable is categorical
- Support Vector Regression (SVR) which is used when the target variable
is continuous
Since we aim to predict PM₁₀ pollutant concentrations, which are
continuous values, we use the Support Vector Regression (SVR)
model.
This model finds a smooth function that predicts PM₁₀ levels but SVR
cannot handle missing values directly, so rows containing NA’s must be
removed or imputed before training the model.
Support Vector Regression (SVR) could be useful for environmental
data because:
- It can deal with non-linear relationships.
- It is quite robust to noise and outliers.
- It can use several explanatory variables at once to improve
predictions.
Even though SVR has these advantages, we will also try other regression and machine learning models to see which one predicts PM₁₀ levels across European countries best.
We will try different kernels in SVR to see which works best for predicting PM₁₀ levels.
We will start with simple kernels and increase complexity to find the best model for PM₁₀ predictions.
In order to run our code, the following libraries are required:
# Install packages if needed
# NOTE: require() returns FALSE (instead of erroring) when a package is
# missing, which is what makes this install-if-missing idiom work.
# The "##" lines below are the console output recorded when the chunk ran;
# the masking messages are informational (e.g. plotly masks stats::filter
# and graphics::layout, not the dplyr verbs used in this report).
if(!require("ggplot2")) install.packages("ggplot2")
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:e1071':
##
## element
if(!require("dplyr")) install.packages("dplyr")
if(!require("e1071")) install.packages("e1071")
if(!require("ggrepel")) install.packages("ggrepel")
## Loading required package: ggrepel
if(!require("kableExtra")) install.packages("kableExtra")
## Loading required package: kableExtra
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
if(!require("plotly")) install.packages("plotly")
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
if(!require("caret")) install.packages("caret")
## Loading required package: caret
## Loading required package: lattice
if(!require("stringi")) install.packages("stringi")
## Loading required package: stringi
# We load the libraries we will need throughout the project:
library(ggplot2)
library(dplyr)
library(e1071)
library(ggrepel)
library(kableExtra)
library(plotly)
library(caret)
library(stringi)
# Fit an epsilon-SVR with a linear kernel on the training set (years <= 2019)
svr_linear_model <- svm(
  pm10_concentration ~ year * no2_concentration + population + latitude + longitude,
  data = train_data,
  type = "eps-regression",
  kernel = "linear"
)
# Predict PM10 for the 2020 validation set
predictions_linear_valid <- predict(svr_linear_model, newdata = valid_data)
# Static diagnostic plot: predictions against observed values,
# with the perfect-fit line y = x dashed in red
ggplot(valid_data,
       aes(x = pm10_concentration, y = predictions_linear_valid)) +
  geom_point(alpha = 0.6, color = "blue") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(title = "Linear SVR Predictions vs Actual (Validation 2020)",
       x = "Actual PM10",
       y = "Predicted PM10")
# Build UTF-8-safe copies of the label columns for tooltips only
# (plotly tooltips can render mixed encodings incorrectly)
tooltip_city <- iconv(valid_data$city, from = "", to = "UTF-8", sub = "?")
tooltip_country <- iconv(valid_data$country_name, from = "", to = "UTF-8", sub = "?")
# Interactive version of the predicted-vs-actual plot with hover tooltips
hover_text_linear <- paste0("City: ", tooltip_city, "<br>Country: ", tooltip_country)
p_valid <- ggplot(valid_data,
                  aes(x = pm10_concentration,
                      y = predictions_linear_valid,
                      text = I(hover_text_linear))) +
  geom_point(alpha = 0.6, color = "blue") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(title = "Linear SVR Predictions vs Actual (Validation 2020)",
       x = "Actual PM10",
       y = "Predicted PM10")
ggplotly(p_valid, tooltip = "text")
# Actual PM10 values for validation data
actuals_linear_valid <- valid_data$pm10_concentration
# Root Mean Squared Error (RMSE) — same units as PM10, penalizes large errors
rmse_linear_valid <- sqrt(mean((predictions_linear_valid - actuals_linear_valid)^2))
# Mean Absolute Error (MAE) — average absolute deviation, less outlier-sensitive
mae_linear_valid <- mean(abs(predictions_linear_valid - actuals_linear_valid))
# R-squared (proportion of variance explained); can be negative when the
# model performs worse than simply predicting the mean
r_squared_linear_valid <- 1 - sum((predictions_linear_valid - actuals_linear_valid)^2) /
sum((actuals_linear_valid - mean(actuals_linear_valid))^2)
# Print all metrics
cat("RMSE (Linear SVR - Validation):", rmse_linear_valid, "\n")
## RMSE (Linear SVR - Validation): 5.399075
cat("MAE (Linear SVR - Validation):", mae_linear_valid, "\n")
## MAE (Linear SVR - Validation): 3.818252
cat("R² (Linear SVR - Validation):", r_squared_linear_valid, "\n")
## R² (Linear SVR - Validation): 0.4219859
# Polynomial-kernel epsilon-SVR, trained on the same years (<= 2019)
svr_poly_model <- svm(
  pm10_concentration ~ year * no2_concentration + population + latitude + longitude,
  data = train_data,
  type = "eps-regression",
  kernel = "polynomial",
  degree = 2, # adjust depending on nonlinearity
  coef0 = 1 # constant term in the polynomial kernel
)
# Validation-set (2020) predictions
predictions_poly_valid <- predict(svr_poly_model, newdata = valid_data)
# Static predicted-vs-actual plot; dashed red line marks a perfect fit
ggplot(valid_data,
       aes(x = pm10_concentration, y = predictions_poly_valid)) +
  geom_point(alpha = 0.6, color = "purple") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(title = "Polynomial SVR Predictions vs Actual (Validation 2020)",
       x = "Actual PM10",
       y = "Predicted PM10")
# Interactive version of the polynomial predicted-vs-actual plot
hover_text_poly <- paste0("City: ", tooltip_city, "<br>Country: ", tooltip_country)
p_poly_valid <- ggplot(valid_data,
                       aes(x = pm10_concentration,
                           y = predictions_poly_valid,
                           text = I(hover_text_poly))) +
  geom_point(alpha = 0.6, color = "purple") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(title = "Polynomial SVR Predictions vs Actual (Validation 2020)",
       x = "Actual PM10",
       y = "Predicted PM10")
ggplotly(p_poly_valid, tooltip = "text")
# Actual PM10 values for validation data
actuals_poly_valid <- valid_data$pm10_concentration
# Root Mean Squared Error (RMSE) — same units as PM10, penalizes large errors
rmse_poly_valid <- sqrt(mean((predictions_poly_valid - actuals_poly_valid)^2))
# Mean Absolute Error (MAE) — average absolute deviation
mae_poly_valid <- mean(abs(predictions_poly_valid - actuals_poly_valid))
# R-squared (proportion of variance explained); negative means worse than
# predicting the mean
r2_poly_valid <- 1 - sum((predictions_poly_valid - actuals_poly_valid)^2) /
sum((actuals_poly_valid - mean(actuals_poly_valid))^2)
# Print all metrics
cat("RMSE (Polynomial - Validation):", rmse_poly_valid, "\n")
## RMSE (Polynomial - Validation): 4.709768
cat("MAE (Polynomial - Validation):", mae_poly_valid, "\n")
## MAE (Polynomial - Validation): 3.581697
cat("R² (Polynomial - Validation):", r2_poly_valid, "\n")
## R² (Polynomial - Validation): 0.5601561
The polynomial model already performs better than the linear one.
library(e1071)
library(dplyr)
# Hyperparameter grid for the polynomial kernel
grid <- expand.grid(
  degree = c(2, 3, 4),
  coef0 = c(0, 1, 2)
)
# Fit a polynomial SVR for one (degree, coef0) pair and score it on the
# 2020 validation set. Returns a one-row data frame of metrics.
eval_poly <- function(d, c0) {
  model <- svm(
    pm10_concentration ~ year * no2_concentration + population + latitude + longitude,
    data = train_data,
    type = "eps-regression",
    kernel = "polynomial",
    degree = d,
    coef0 = c0
  )
  pred_valid <- predict(model, newdata = valid_data)
  actuals <- valid_data$pm10_concentration
  data.frame(
    degree = d,
    coef0 = c0,
    RMSE = sqrt(mean((pred_valid - actuals)^2)),
    MAE = mean(abs(pred_valid - actuals)),
    R2 = 1 - sum((pred_valid - actuals)^2) / sum((actuals - mean(actuals))^2)
  )
}
# Evaluate every grid combination. Collecting the rows as a list and binding
# once avoids the quadratic cost of growing a data frame with rbind() inside
# a loop.
results <- bind_rows(Map(eval_poly, grid$degree, grid$coef0)) %>%
  # Arrange by RMSE ascending (best model first)
  arrange(RMSE)
# Show the table
print(results)
## degree coef0 RMSE MAE R2
## 1 3 2 4.678480 3.421604 0.5659805
## 2 3 1 4.678512 3.420927 0.5659746
## 3 2 1 4.709768 3.581697 0.5601561
## 4 2 2 4.709824 3.581366 0.5601456
## 5 4 1 5.418156 3.638885 0.4178932
## 6 4 2 5.447603 3.637149 0.4115488
## 7 4 0 6.006909 4.354437 0.2845130
## 8 2 0 6.330398 5.145282 0.2053761
## 9 3 0 7.990291 5.482813 -0.2659739
# Refit the polynomial SVR using the hyperparameters chosen on the
# validation set (degree = 3, coef0 = 2)
svr_poly_model_best <- svm(
  pm10_concentration ~ year * no2_concentration + population + latitude + longitude,
  data = train_data,
  type = "eps-regression",
  kernel = "polynomial",
  degree = 3, # best from validation
  coef0 = 2 # best from validation
)
# Validation-set (2020) predictions from the tuned model
predictions_poly_valid_best <- predict(svr_poly_model_best, newdata = valid_data)
# Static predicted-vs-actual plot; dashed red line marks a perfect fit
ggplot(valid_data,
       aes(x = pm10_concentration, y = predictions_poly_valid_best)) +
  geom_point(alpha = 0.6, color = "purple") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(title = "Polynomial SVR Predictions vs Actual (Validation 2020)",
       x = "Actual PM10",
       y = "Predicted PM10")
# Interactive version with city/country hover text
hover_text_poly_best <- paste0("City: ", tooltip_city, "<br>Country: ", tooltip_country)
p_poly_valid <- ggplot(valid_data,
                       aes(x = pm10_concentration,
                           y = predictions_poly_valid_best,
                           text = hover_text_poly_best)) +
  geom_point(alpha = 0.6, color = "purple") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(title = "Polynomial SVR Predictions vs Actual (Validation 2020)",
       x = "Actual PM10",
       y = "Predicted PM10")
ggplotly(p_poly_valid, tooltip = "text")
# Actual PM10 values for validation data
actuals_poly_valid_best <- valid_data$pm10_concentration
# Root Mean Squared Error (RMSE) — same units as PM10, penalizes large errors
rmse_poly_valid_best <- sqrt(mean((predictions_poly_valid_best - actuals_poly_valid_best)^2))
# Mean Absolute Error (MAE) — average absolute deviation
mae_poly_valid_best <- mean(abs(predictions_poly_valid_best - actuals_poly_valid_best))
# R-squared (proportion of variance explained)
r2_poly_valid_best <- 1 - sum((predictions_poly_valid_best - actuals_poly_valid_best)^2) /
sum((actuals_poly_valid_best - mean(actuals_poly_valid_best))^2)
# Print all metrics (these match the best row of the tuning table above)
cat("RMSE (Polynomial - Validation - Best):", rmse_poly_valid_best, "\n")
## RMSE (Polynomial - Validation - Best): 4.67848
cat("MAE (Polynomial - Validation - Best):", mae_poly_valid_best, "\n")
## MAE (Polynomial - Validation - Best): 3.421604
cat("R² (Polynomial - Validation - Best):", r2_poly_valid_best, "\n")
## R² (Polynomial - Validation - Best): 0.5659805
# Libraries used in this section (already attached earlier):
#library(e1071)
#library(ggplot2)
#library(plotly)
#library(stringi)
# Rebuild the UTF-8-safe tooltip strings for the interactive plots
tooltip_city <- iconv(valid_data$city, from = "", to = "UTF-8", sub = "?")
tooltip_country <- iconv(valid_data$country_name, from = "", to = "UTF-8", sub = "?")
# Radial-basis-function (RBF) epsilon-SVR on the training years (<= 2019)
svr_rbf_model <- svm(
  pm10_concentration ~ year * no2_concentration + population + latitude + longitude,
  data = train_data,
  type = "eps-regression",
  kernel = "radial",
  gamma = 0.01, # starting value; tuned in the next section
  cost = 1 # starting value; tuned in the next section
)
# Validation-set (2020) predictions from the RBF model
predictions_rbf_valid <- predict(svr_rbf_model, newdata = valid_data)
# Static predicted-vs-actual plot; dashed red line marks a perfect fit
ggplot(valid_data,
       aes(x = pm10_concentration, y = predictions_rbf_valid)) +
  geom_point(alpha = 0.6, color = "darkgreen") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(title = "RBF SVR Predictions vs Actual (Validation 2020)",
       x = "Actual PM10",
       y = "Predicted PM10")
# Interactive version with city/country hover text
hover_text_rbf <- paste0("City: ", tooltip_city, "<br>Country: ", tooltip_country)
p_rbf_valid <- ggplot(valid_data,
                      aes(x = pm10_concentration,
                          y = predictions_rbf_valid,
                          text = hover_text_rbf)) +
  geom_point(alpha = 0.6, color = "darkgreen") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(title = "RBF SVR Predictions vs Actual (Validation 2020)",
       x = "Actual PM10",
       y = "Predicted PM10")
ggplotly(p_rbf_valid, tooltip = "text")
# Compute validation metrics for the RBF model
actuals_rbf_valid <- valid_data$pm10_concentration
# RMSE — same units as PM10, penalizes large errors
rmse_rbf_valid <- sqrt(mean((predictions_rbf_valid - actuals_rbf_valid)^2))
# MAE — average absolute deviation
mae_rbf_valid <- mean(abs(predictions_rbf_valid - actuals_rbf_valid))
# R² — proportion of variance explained (negative = worse than the mean)
r2_rbf_valid <- 1 - sum((predictions_rbf_valid - actuals_rbf_valid)^2) /
sum((actuals_rbf_valid - mean(actuals_rbf_valid))^2)
cat("RMSE (RBF - Validation):", rmse_rbf_valid, "\n")
## RMSE (RBF - Validation): 4.529935
cat("MAE (RBF - Validation):", mae_rbf_valid, "\n")
## MAE (RBF - Validation): 3.497689
cat("R2 (RBF - Validation):", r2_rbf_valid, "\n")
## R2 (RBF - Validation): 0.5931039
This is already better than the polynomial kernel.
## Hyperparameter Tuning for the RBF Kernel
# Hyperparameter grid for RBF kernel
grid_rbf <- expand.grid(
  gamma = c(0.01, 0.1, 0.5),
  cost = c(1, 5, 10)
)
# Fit an RBF SVR for one (gamma, cost) pair and score it on the
# 2020 validation set. Returns a one-row data frame of metrics.
eval_rbf <- function(g, k) {
  model <- svm(
    pm10_concentration ~ year * no2_concentration + population + latitude + longitude,
    data = train_data,
    type = "eps-regression",
    kernel = "radial",
    gamma = g,
    cost = k
  )
  pred_valid <- predict(model, newdata = valid_data)
  actuals <- valid_data$pm10_concentration
  data.frame(
    gamma = g,
    cost = k,
    RMSE = sqrt(mean((pred_valid - actuals)^2)),
    MAE = mean(abs(pred_valid - actuals)),
    R2 = 1 - sum((pred_valid - actuals)^2) / sum((actuals - mean(actuals))^2)
  )
}
# Evaluate every grid combination. Collecting the rows as a list and binding
# once avoids the quadratic cost of growing a data frame with rbind() inside
# a loop.
results_rbf <- bind_rows(Map(eval_rbf, grid_rbf$gamma, grid_rbf$cost)) %>%
  # Arrange by RMSE ascending (best model first)
  arrange(RMSE)
# Show the table
print(results_rbf)
## gamma cost RMSE MAE R2
## 1 0.10 1 4.375005 3.267978 0.6204607
## 2 0.10 5 4.396605 3.126451 0.6167037
## 3 0.01 10 4.425666 3.400929 0.6116199
## 4 0.01 5 4.441561 3.418860 0.6088253
## 5 0.10 10 4.506137 3.130237 0.5973679
## 6 0.01 1 4.529935 3.497689 0.5931039
## 7 0.50 1 4.734048 3.441449 0.5556092
## 8 0.50 5 4.859162 3.578828 0.5318098
## 9 0.50 10 4.898298 3.636437 0.5242377
# Best RMSE
# slice_min keeps the row(s) with the smallest value; ties would all be kept
best_rmse <- results_rbf %>% slice_min(RMSE, n = 1)
cat("Best RMSE:\n")
## Best RMSE:
print(best_rmse)
## gamma cost RMSE MAE R2
## 1 0.1 1 4.375005 3.267978 0.6204607
# Best MAE
best_mae <- results_rbf %>% slice_min(MAE, n = 1)
cat("\nBest MAE:\n")
##
## Best MAE:
print(best_mae)
## gamma cost RMSE MAE R2
## 1 0.1 5 4.396605 3.126451 0.6167037
# Best R² (highest, hence slice_max)
best_r2 <- results_rbf %>% slice_max(R2, n = 1)
cat("\nBest R²:\n")
##
## Best R²:
print(best_r2)
## gamma cost RMSE MAE R2
## 1 0.1 1 4.375005 3.267978 0.6204607
# Refit the RBF SVR using the hyperparameters chosen on the validation set
svr_rbf_model_best <- svm(
  pm10_concentration ~ year * no2_concentration + population + latitude + longitude,
  data = train_data,
  type = "eps-regression",
  kernel = "radial",
  gamma = 0.1, # best gamma from validation
  cost = 1 # best cost from validation
)
# Validation-set (2020) predictions from the tuned model
predictions_rbf_valid_best <- predict(svr_rbf_model_best, newdata = valid_data)
# Static predicted-vs-actual plot; dashed red line marks a perfect fit
ggplot(valid_data,
       aes(x = pm10_concentration, y = predictions_rbf_valid_best)) +
  geom_point(alpha = 0.6, color = "darkgreen") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(title = "RBF SVR Predictions vs Actual (Validation 2020)",
       x = "Actual PM10",
       y = "Predicted PM10")
# Interactive version with city/country hover text
hover_text_rbf_best <- paste0("City: ", tooltip_city, "<br>Country: ", tooltip_country)
p_rbf_valid_best <- ggplot(valid_data,
                           aes(x = pm10_concentration,
                               y = predictions_rbf_valid_best,
                               text = hover_text_rbf_best)) +
  geom_point(alpha = 0.6, color = "darkgreen") +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  theme_minimal() +
  labs(title = "RBF SVR Predictions vs Actual (Validation 2020)",
       x = "Actual PM10",
       y = "Predicted PM10")
ggplotly(p_rbf_valid_best, tooltip = "text")
# Actual PM10 values for validation data
actuals_rbf_valid_best <- valid_data$pm10_concentration
# Root Mean Squared Error (RMSE) — same units as PM10, penalizes large errors
rmse_rbf_valid_best <- sqrt(mean((predictions_rbf_valid_best - actuals_rbf_valid_best)^2))
# Mean Absolute Error (MAE) — average absolute deviation
mae_rbf_valid_best <- mean(abs(predictions_rbf_valid_best - actuals_rbf_valid_best))
# R-squared (proportion of variance explained)
r2_rbf_valid_best <- 1 - sum((predictions_rbf_valid_best - actuals_rbf_valid_best)^2) /
sum((actuals_rbf_valid_best - mean(actuals_rbf_valid_best))^2)
# Print all metrics (these match the best row of the RBF tuning table above)
cat("RMSE (RBF - Validation - Best):", rmse_rbf_valid_best, "\n")
## RMSE (RBF - Validation - Best): 4.375005
cat("MAE (RBF - Validation - Best):", mae_rbf_valid_best, "\n")
## MAE (RBF - Validation - Best): 3.267978
cat("R² (RBF - Validation - Best):", r2_rbf_valid_best, "\n")
## R² (RBF - Validation - Best): 0.6204607